This document describes the steps, code, and files used to generate the ~50kb TwinStrand sequencing panel to study the genetics of male infertility. Specifically, we are targeting DNA damage repair genes, known cancer genes, AZF genes, and genes involved in clonal spermatogenesis.
This is the script used to generate a targeted sequencing panel for the spermseq male infertility project. )
Step 1: Copy the gene list from the spreadsheet and save as a 1-column text file (target_genes.txt) Step 2: Run below script to save into kingspeak UNIX environment
## Change working directory
cd ~/git/spermseq/twinstrand_target_gene_set
## Sort and unique
cat target_genes.txt | sort | uniq > target_genes_v1.txt
## Copy to kingspeak directory
scp target_genes_v1.txt u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets
Use the build 38 GTF file to get other identifying information for each gene_name of interest. Look only at genes/regions that are coding sequences (hence grep -w “CDS” in the below command…)
## Define directory
cd ~/spermseq/twinstrand_target
## Download GTF file
wget ftp://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz | zless | grep -w "CDS" > Homo_sapiens.GRCh38.102.CDS.gtf
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets/Homo_sapiens.GRCh38.102.CDS.gtf ~/git/spermseq/twinstrand_target_gene_set/data
## Get the gene_ids of the target genes of interest
cat target_genes_v1.txt | awk '{print "cat Homo_sapiens.GRCh38.102.CDS.gtf | grep \"gene_name \\\""$1"\\\"\""}' | bash | cut -f 9 | awk 'BEGIN{FS = "\"; "} ; {print $0}' | sed 's/\"; /|/g' | sed 's/;//g' | sed 's/ \"/=/g' > target_gene_list_gtf_identifiers.txt
## Copy target_genes_gid_tid_pid.txt file to local
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets/target_gene_list_gtf_identifiers.txt ~/git/spermseq/twinstrand_target_gene_set
Use R to obtain key GTF fields for each gene: gene_name, gene_id, transcript_id, protein_id, and exon_number.
## Load in packages
library(data.table)
library(dplyr)
## Read in the file
df <- read.table("~/git/spermseq/twinstrand_target_gene_set/target_gene_list_gtf_identifiers.txt")
## Go through each row and get the gene_id, gene_name, transcript_id, protein_id, exon_number, and exon_id
x <- df[1,]
ids <- rbindlist(apply(df, 1, function(x) {
fields <- (strsplit(x, split = "|", fixed = TRUE))[[1]]
fields <- fields[grep(paste(c("gene_name", "gene_id", "transcript_id", "protein_id", "exon_number"), collapse = "|"), fields)] %>%
strsplit(., split = "=", fixed = TRUE) %>% unlist()
return(data.table("gene_name"=fields[8],
"gene_id"=fields[2],
"transcript_id"=fields[4],
"protein_id"=fields[10],
"exon_number"=fields[6]))
}))
write.table(x = ids, file = "~/git/spermseq/twinstrand_target_gene_set/target_gene_list_ids.txt", quote = FALSE, sep = "\t", row.names = FALSE)
Obtain GTEx TPM data using gene_ids belonging to the genes of interest.
## Copy the gene list ids file to kingspeak
scp ~/git/spermseq/twinstrand_target_gene_set/target_gene_list_ids.txt u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets
## Download master sample list to identify testis samples
cd ~/spermseq/data/GTEx/samples
wget https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
grep -w -i "testis" GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt | awk '{print $1}' > GTEx_Analysis_v8_Annotations_SampleAttributesDS_TestisOnlySamples_Data.txt
## Download master transcript TPM data
cd ~/spermseq/data/GTEx/transcripts
wget https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz
gunzip GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz
## Obtain GTEx transcript TPM data from genes of interest --> see "Get TPM data from GTEx testis samples"
cd ~/spermseq/data/GTEx/transcripts
head -n 1 GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct > header.temp ## Get the header which contains sample ids
## Run this if the file already exists: rm ~/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct
cat ~/spermseq/twinstrand_targets/target_gene_list_ids.txt | awk -v OFS='\t' '{print $1, $2}' | sort | uniq | grep -v "gene_name" | cut -f 2 | awk '{print "cat $HOME/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct | grep "$1" "}' | bash >> ~/spermseq/data/GTEx/transcripts/testis.data.temp
cat header.temp testis.data.temp > GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct
## Remove files
rm header.temp
rm testis.data.temp
## Copy final file to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct ~/git/spermseq/twinstrand_target_gene_set/data
## Download appris principal isoform data
cd ~/spermseq/data/appris
wget http://apprisws.bioinfo.cnio.es/pub/current_release/datafiles/homo_sapiens/GRCh38/appris_data.principal.txt
## Copy to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/appris/appris_data.principal.txt ~/git/spermseq/twinstrand_target_gene_set/data
[insert image]
knitr::opts_chunk$set(echo=TRUE)
######################################
##### Load in necessary packages #####
######################################
library(data.table)
library(dplyr)
library(ggplot2)
library(beeswarm)
library(ggbeeswarm)
library(reshape)
library(ggpubr)
library(biomaRt)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
##################################
##### Specify file locations #####
##################################
appris_file <- "~/git/spermseq/twinstrand_target_gene_set/data/appris_data.principal.txt"
gtf_file <- "~/git/spermseq/twinstrand_target_gene_set/data/Homo_sapiens.GRCh38.102.CDS.gtf"
GTEx_file <- "~/git/spermseq/twinstrand_target_gene_set/data/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct"
GTEx_samples <- "~/git/spermseq/twinstrand_target_gene_set/data/GTEx_testis_samples.txt"
#########################################
##### Specify the genes of interest #####
#########################################
genes <- c("WT1", "CDC14A", "DMRT1", "KLHL10", "M1AP", "MEI1", "STAG3",
"SYCP2", "SYCP3", "TEX11", "USP26", "PKD1", "AR", "AKT3", "APOA1",
"APC", "ATM", "BRAF", "BRCA1", "BRCA2", "CBL", "CDKN2A", "CTNNB1", "DICER1",
"DNMT3A", "ERCC1", "ESR1", "FANCM", "FGFR2", "FGFR3", "H3-3A", "H3-3B", "HRAS",
"KIT", "KRAS", "LIG4", "MAP2K1", "MAP2K2", "MLH1", "MLH3", "MSH5", "NR5A1", "NF1",
"PMS2", "PPM1D", "PRKACA", "PTCH1", "PTPN11", "RAG1", "RAF1", "RB1", "RET", "SMAD4",
"SOS1", "STAT3", "STK11", "TEX14", "TEX15", "TSHR", "VHL", "XPA", "XRCC1")
moderate_genes <- c("USP26", "TEX14", "SYCP2", "STAG3", "PKD1", "M1AP", "KLHL10", "DMRT1", "CDC14A")
moderate_gene_ids <- c("ENSG00000134588", "ENSG00000121101", "ENSG00000196074", "ENSG00000066923", "ENSG00000008710", "ENSG00000159374", "ENSG00000161594", "ENSG00000137090", "ENSG00000079335")
genes <- genes[-which(genes %in% moderate_genes)]
genes <- c(genes, "USP9Y", "DDX3Y", "PRY", "DAZ1", "DAZ2", "DAZ3", "DAZ4", "DAZL", "CDY1", "CDY1B", "RBM5")
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
source("~/git/spermseq/script/testis_GTEx_samples.R")
testis_GTEx_tpm <- testis_GTEx_samples(GTEx_file = GTEx_file, GTEx_samples = GTEx_samples)
head(testis_GTEx_tpm)
}
See APPRIS for column details… SANITY CHECK: This steps check that each gene of interest has an APPRIS isoform tag. If not, it will obtain the gene_id from biomaRt using the gene_name. No transcript_ids or CCDS_id will be obtained and the ‘status’ column will be flagged with a “minor” APPRIS designation.
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
source("~/git/spermseq/script/appris_manipulation.R")
appris_df <- appris_manipulation(appris_file = appris_file, genes = genes)
head(appris_df)
}
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
source("~/git/spermseq/script/appris_manipulation.R")
appris_df_tpm <- get_appris_tpm(appris_df = appris_df, testis_GTEx_tpm = testis_GTEx_tpm)
appris_df_tpm <- data.table(do.call("rbind", appris_df_tpm)) %>% `colnames<-`(c("gene_name", "gene_id", "transcript_id", "CCDS", "status", "avg_GTEx_TPM"))
appris_df_tpm
## See which appris isoforms did not have GTEx values
#genes[! genes %in% unique(appris_df_tpm$gene_name)]
}
These are transcripts that are not identified as PRINCIPAL isoforms by APPRIS, but have avg TPM values across all testis samples > the minimum value of the APPRIS-defined PRINCIPAL isoforms.
NOTE: The GTEx-only transcript must have an avg TPM > 1 across all testis samples to be included in the dataset
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
source("~/git/spermseq/script/gtex_transcript_tpm_manipulation.R")
appris_gtex_transcript_tpm <- gtex_transcript_tpm_manipulation(appris_df_tpm = appris_df_tpm, testis_GTEx_tpm = testis_GTEx_tpm, moderate_gene_ids = moderate_gene_ids)
# ## Run this command to filter out appris exons not reliably defined as principal exons
# appris_gtex_transcript_tpm <- appris_gtex_transcript_tpm[!(status %in% c("PRINCIPAL:4", "PRINCIPAL:5", "ALTERNATIVE:1", "ALTERNATIVE:2"))]
# file.remove("~/git/spermseq/script/gtf_df.Rdata")
appris_gtex_transcript_tpm
}
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
## This is the longest step, see if the data already exists
# if (file.exists("~/git/spermseq/script/gtf_df.Rdata")) {
# load("~/git/spermseq/script/gtf_df.Rdata")
# }
if (!file.exists("~/git/spermseq/script/gtf_df.Rdata")) {
## Read in the gtf file
gtf <- fread(gtf_file, header=FALSE) %>% .[,c(1,4,5,9)] %>% `colnames<-` (c("chr", "start", "stop", "info"))
## Get exon positions
source("~/git/spermseq/script/gtf_manipulation.R")
gtf_df <- gtf_manipulation(gtf = gtf, df = appris_gtex_transcript_tpm)
head(gtf_df)
save.image("~/git/spermseq/script/gtf_df.Rdata")
## Write output to text file
write.table(x = gtf_df, file = "~/git/spermseq/twinstrand_target_gene_set/output/appris_gtex_gtf_testis.txt",
quote = FALSE, sep = "\t", row.names = FALSE)
}
}
Schematic depicting how different exons across isoforms of a gene are merged.
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
## Output the warning messages of the pfam domain exon analysis
if (file.exists("~/git/spermseq/script/get_pfam_log.txt")) {
file.remove("~/git/spermseq/script/get_pfam_log.txt")
}
## Capture output to a file
sink_file <- file("~/git/spermseq/script/get_pfam_log.txt", open = "wt")
sink(sink_file)
sink(sink_file, type = "message")
## Run the function
source("~/git/spermseq/script/get_pfam.R")
pfam_df <- get_pfam(df = gtf_df)
## Output the data
pfam_df
## Revert output back to the console
sink(type = "message")
sink()
}
Schematic depicting how different exons across isoforms of a gene are merged.
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
source("~/git/spermseq/script/merge_exons.R")
merged_exons <- merge_exons(df = pfam_df)
## Get the total panel space (exonic)
total <- sum(merged_exons$exon_size)
## Calculate proportion of panel space taken up by each gene
merged_exons$proportion_total <- merged_exons$total_nucleotides/total*100
#head(merged_exons)
## Remove exons found below threshold of total proportion of transcripts for a given gene:
source("~/git/spermseq/script/transcript_proportion_filter.R")
merged_exons <- transcript_proportion_filter(merged_exons = merged_exons)
## Write the output file
write.table(x = merged_exons, file = paste0("~/git/spermseq/twinstrand_target_gene_set/output/merged_exons_.txt"),
quote = FALSE, sep = "\t", row.names = FALSE)
}
#summary(merged_exons$exon_size)
# ## Factor the data so that it plots the x-axis in an order
# merged_exons$x_axis <- paste0(merged_exons$gene_name, " (n = " , merged_exons$total_exons, " exons; avg = ", merged_exons$avg_exon_size, ")")
# merged_exons$x_axis <- factor(merged_exons$x_axis, levels = unique(merged_exons$x_axis[order(merged_exons$total_nucleotides, decreasing = TRUE)]))
#
# ## Generate scatter plot --> Exons (y) per gene (x)
# p1 <- ggplot(data = merged_exons) + geom_point(aes(x = x_axis, y = exon_size, color = transcript_num)) +
# theme(axis.ticks.x = element_blank(), axis.text.x = element_text(angle = 75, hjust = 1)) +
# labs(x = "Gene", y = paste0("Exon Size"), color="# of Transcripts") + scale_y_log10() +
# scale_color_gradient(low = "grey", high = "red")
#
# ## Show proportion of panel space taken up by each gene
# p2 <- ggplot(data = merged_exons) + geom_line(aes(x = x_axis, y = proportion_total, group = 1), color = "Blue") +
# theme(axis.ticks.x = element_blank(), axis.text.x = element_blank(), axis.title.x = element_blank()) +
# labs(y = paste0("Proportion of", "\n", "Total (%)", "\n")) +
# annotate(geom = "text", x = 40, y = 5, label = paste0("Total = ", sum(merged_exons$exon_size), " nucleotides")) +
# geom_vline(xintercept = 9.5, linetype = 2, color = "red")
#
# ggarrange(p2, p1, nrow=2, ncol=1, common.legend = TRUE, legend = "right", heights = c(0.5, 2))
## Download [3D Hotspots](http://www.3dhotspots.org/#/home) dataset and manually save as a txt file
wget http://www.3dhotspots.org/files/3d_hotspots.xls
## Download [Cancer Hotspots](https://www.cancerhotspots.org/#/home) MAF file
cd ~/spermseq/data/hotspots
wget http://download.cbioportal.org/cancerhotspots/cancerhotspots.v2.maf.gz
zless cancerhotspots.v2.maf.gz | grep -v "#" | grep -w "protein_coding" > cancerhotspots.v2.maf
zless cancerhotspots.v2.maf.gz | grep -v "#" | head -n 1 > cancerhotspots.v2.maf.header
cat cancerhotspots.v2.maf.header cancerhotspots.v2.maf > cancerhotspots.v2.proteincoding.maf
cat cancerhotspots.v2.proteincoding.maf | cut -f 1,2,5,6,7,38 > cancerhotspots.v2.proteincoding.columns.maf
## Copy to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/hotspots/cancerhotspots.v2.proteincoding.columns.maf ~/git/spermseq/twinstrand_target_gene_set/data
## Download the liftover file to convert hg19 coordinates from MAF file to hg38
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz
gunzip hg19ToHg38.over.chain.gz > hg19ToHg38.over.chain
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/hotspots/hg19ToHg38.over.chain ~/git/spermseq/twinstrand_target_gene_set/data
## Exons with 3d hotspot mutations
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
source("~/git/spermseq/script/format_hotspot.R")
merged_exons_3d_hotspot <- format_3D_hotspot(hotspot_file = "~/git/spermseq/twinstrand_target_gene_set/data/3d_hotspots_gao.txt",
genes = genes, merged_exons = merged_exons)
}
## Exons with snp/onp/ins/del hotspot mutations
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
source("~/git/spermseq/script/format_hotspot.R")
hotspot_file <- "~/git/spermseq/twinstrand_target_gene_set/data/cancerhotspots.v2.proteincoding.columns.maf"
merged_exons_hotspot <- format_hotspot(hotspot_file = hotspot_file, genes = genes, merged_exons_3d_hotspot = merged_exons_3d_hotspot)
merged_exons_hotspot
}
if (file.exists("~/git/spermseq/script/final.Rdata") == FALSE) {
## Load in necessary packages
library(BSgenome.Hsapiens.UCSC.hg38)
library(BSgenome)
library(GenomicRanges)
## Define GetGC function
GetGC <- function(bsgenome, gr){
seqs <- BSgenome::getSeq(bsgenome, gr)
return(as.numeric(Biostrings::letterFrequency(x = seqs, letters = "GC", as.prob = TRUE)))
}
## Get the IRanges
ranges <- IRanges(start = merged_exons_hotspot$start, end = merged_exons_hotspot$stop)
## Get GRanges object
gr <- GRanges(seqnames = paste0("chr", merged_exons_hotspot$chr), ranges=ranges)
## Get the gc content of each sequence
merged_exons_hotspot$gc_content <- GetGC(bsgenome = BSgenome.Hsapiens.UCSC.hg38, gr = gr)
write.table(x = merged_exons_hotspot, file = "~/git/spermseq/twinstrand_target_gene_set/output/merged_exons_final.txt",
sep = "\t", quote = FALSE, row.names = FALSE)
## Save the data
save.image("~/git/spermseq/script/final.Rdata")
}
if (file.exists("~/git/spermseq/script/final.Rdata") == TRUE) {
load("~/git/spermseq/script/final.Rdata")
}
## Filter out exons/rows that have NA values in pfam_id column
merged_exons_pfam <- na.omit(merged_exons_hotspot)
merged_exons_pfam_hotspot <- merged_exons_pfam[hotspot_3d_exon_num > 0 | hotspot_exon_num > 0]
## Calculate proportion of panel space taken up by each gene
total <- sum(merged_exons_pfam_hotspot$exon_size)
merged_exons_pfam_hotspot <- rbindlist(lapply(unique(merged_exons_pfam_hotspot$gene_id), function(x, merged_exons_pfam_hotspot) {
temp <- merged_exons_pfam_hotspot[gene_id == x]
temp$total_nucleotides <- sum(temp$exon_size)
temp$total_exons <- nrow(temp)
temp$transcript_num <- (temp$transcript_id %>% strsplit(., split = " | "))[[1]] %>% unique() %>% length()
temp$avg_exon_size <- round(mean(temp$exon_size), digits = 2)
return(temp)
}, merged_exons_pfam_hotspot=merged_exons_pfam_hotspot))
merged_exons_pfam_hotspot$proportion_total <- merged_exons_pfam_hotspot$total_nucleotides/total*100
source("~/git/spermseq/script/plot_df.R")
p <- plot_df(df = merged_exons_pfam_hotspot, genes = genes)
p
## Filter out exons/rows that have NA values in pfam_id column
merged_exons_pfam <- na.omit(merged_exons_hotspot)
## Calculate proportion of panel space taken up by each gene
total <- sum(merged_exons_pfam$exon_size)
merged_exons_pfam <- rbindlist(lapply(unique(merged_exons_pfam$gene_id), function(x, merged_exons_pfam) {
temp <- merged_exons_pfam[gene_id == x]
temp$total_nucleotides <- sum(temp$exon_size)
temp$total_exons <- nrow(temp)
temp$transcript_num <- (temp$transcript_id %>% strsplit(., split = " | "))[[1]] %>% unique() %>% length()
temp$avg_exon_size <- round(mean(temp$exon_size), digits = 2)
return(temp)
}, merged_exons_pfam=merged_exons_pfam))
merged_exons_pfam$proportion_total <- merged_exons_pfam$total_nucleotides/total*100
source("~/git/spermseq/script/plot_df.R")
p <- plot_df(df = merged_exons_pfam, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.2]
## Generate plot
source("~/git/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.4]
## Generate plot
source("~/git/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.5]
## Generate plot
source("~/git/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.6]
## Generate plot
source("~/git/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.8]
## Generate plot
source("~/git/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion == 1]
## Generate plot
source("~/git/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p